import csv
import json
from pathlib import Path

def convert_csv_to_alpaca_jsonl(
    csv_file_path='./zero_gen/merged_data.csv',
    jsonl_file_path='./zero_gen/alpaca_data.jsonl',
):
    """Convert a CSV file with prompt/answer columns into Alpaca-style JSONL.

    Each CSV data row becomes one JSON object on its own line, with the
    ``prompt`` column written as ``"instruction"`` and the ``answer`` column
    written as ``"output"``. Both values are stripped of surrounding
    whitespace; a column that is absent or has no value for a row is written
    as an empty string.

    Args:
        csv_file_path: Path (``str`` or ``Path``) of the CSV file to read.
            It is read as UTF-8, with a leading byte-order mark tolerated.
        jsonl_file_path: Path (``str`` or ``Path``) of the JSONL file to
            write. An existing file is overwritten.

    Returns:
        The number of data rows written to the JSONL file.

    Raises:
        OSError: If the input cannot be opened for reading or the output
            cannot be opened for writing.
    """
    csv_file_path = Path(csv_file_path)
    jsonl_file_path = Path(jsonl_file_path)

    # newline='' hands raw line endings to the csv module so that line breaks
    # inside quoted fields survive intact; 'utf-8-sig' drops a leading BOM
    # that would otherwise become part of the first header name.
    with open(csv_file_path, 'r', encoding='utf-8-sig', newline='') as csv_file, \
         open(jsonl_file_path, 'w', encoding='utf-8') as jsonl_file:

        row_num = 0  # stays 0 when the CSV has no data rows
        for row_num, row in enumerate(csv.DictReader(csv_file), 1):
            # DictReader fills columns missing from a short row with None,
            # so `or ''` is needed in addition to the .get() lookup.
            alpaca_entry = {
                "instruction": (row.get('prompt') or '').strip(),
                "output": (row.get('answer') or '').strip(),
            }

            # ensure_ascii=False keeps non-ASCII text readable in the output.
            jsonl_file.write(json.dumps(alpaca_entry, ensure_ascii=False) + '\n')

            # Report progress every 10000 rows.
            if row_num % 10000 == 0:
                print(f"已处理 {row_num} 行数据")

    print(f"转换完成！已将CSV数据转换为Alpaca格式并保存到 {jsonl_file_path}")
    return row_num

# Run the conversion only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    convert_csv_to_alpaca_jsonl()